Personal Assignment 2

Author

Landon Carpenter

Published

October 17, 2023

# Environment setup: RNG seed, plotting/data libraries, and the data set.
import random

from seaborn.palettes import color_palette
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Fix the Python RNG so any stdlib-random operations below are repeatable.
random.seed(1276)

# 1. Load the credit-card transactions and preview the first six rows.
card = pd.read_csv('creditcard.csv')
card.head(6)
Time V1 V2 V3 V4 V5 V6 V7 V8 V9 ... V21 V22 V23 V24 V25 V26 V27 V28 Amount Class
0 0.0 -1.359807 -0.072781 2.536347 1.378155 -0.338321 0.462388 0.239599 0.098698 0.363787 ... -0.018307 0.277838 -0.110474 0.066928 0.128539 -0.189115 0.133558 -0.021053 149.62 0
1 0.0 1.191857 0.266151 0.166480 0.448154 0.060018 -0.082361 -0.078803 0.085102 -0.255425 ... -0.225775 -0.638672 0.101288 -0.339846 0.167170 0.125895 -0.008983 0.014724 2.69 0
2 1.0 -1.358354 -1.340163 1.773209 0.379780 -0.503198 1.800499 0.791461 0.247676 -1.514654 ... 0.247998 0.771679 0.909412 -0.689281 -0.327642 -0.139097 -0.055353 -0.059752 378.66 0
3 1.0 -0.966272 -0.185226 1.792993 -0.863291 -0.010309 1.247203 0.237609 0.377436 -1.387024 ... -0.108300 0.005274 -0.190321 -1.175575 0.647376 -0.221929 0.062723 0.061458 123.50 0
4 2.0 -1.158233 0.877737 1.548718 0.403034 -0.407193 0.095921 0.592941 -0.270533 0.817739 ... -0.009431 0.798278 -0.137458 0.141267 -0.206010 0.502292 0.219422 0.215153 69.99 0
5 2.0 -0.425966 0.960523 1.141109 -0.168252 0.420987 -0.029728 0.476201 0.260314 -0.568671 ... -0.208254 -0.559825 -0.026398 -0.371427 -0.232794 0.105915 0.253844 0.081080 3.67 0

6 rows × 31 columns

# 2. Describe the data: count/mean/std/min/quartiles/max for every column.
card.describe()
Time V1 V2 V3 V4 V5 V6 V7 V8 V9 ... V21 V22 V23 V24 V25 V26 V27 V28 Amount Class
count 284807.000000 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 ... 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 284807.000000 284807.000000
mean 94813.859575 1.168375e-15 3.416908e-16 -1.379537e-15 2.074095e-15 9.604066e-16 1.487313e-15 -5.556467e-16 1.213481e-16 -2.406331e-15 ... 1.654067e-16 -3.568593e-16 2.578648e-16 4.473266e-15 5.340915e-16 1.683437e-15 -3.660091e-16 -1.227390e-16 88.349619 0.001727
std 47488.145955 1.958696e+00 1.651309e+00 1.516255e+00 1.415869e+00 1.380247e+00 1.332271e+00 1.237094e+00 1.194353e+00 1.098632e+00 ... 7.345240e-01 7.257016e-01 6.244603e-01 6.056471e-01 5.212781e-01 4.822270e-01 4.036325e-01 3.300833e-01 250.120109 0.041527
min 0.000000 -5.640751e+01 -7.271573e+01 -4.832559e+01 -5.683171e+00 -1.137433e+02 -2.616051e+01 -4.355724e+01 -7.321672e+01 -1.343407e+01 ... -3.483038e+01 -1.093314e+01 -4.480774e+01 -2.836627e+00 -1.029540e+01 -2.604551e+00 -2.256568e+01 -1.543008e+01 0.000000 0.000000
25% 54201.500000 -9.203734e-01 -5.985499e-01 -8.903648e-01 -8.486401e-01 -6.915971e-01 -7.682956e-01 -5.540759e-01 -2.086297e-01 -6.430976e-01 ... -2.283949e-01 -5.423504e-01 -1.618463e-01 -3.545861e-01 -3.171451e-01 -3.269839e-01 -7.083953e-02 -5.295979e-02 5.600000 0.000000
50% 84692.000000 1.810880e-02 6.548556e-02 1.798463e-01 -1.984653e-02 -5.433583e-02 -2.741871e-01 4.010308e-02 2.235804e-02 -5.142873e-02 ... -2.945017e-02 6.781943e-03 -1.119293e-02 4.097606e-02 1.659350e-02 -5.213911e-02 1.342146e-03 1.124383e-02 22.000000 0.000000
75% 139320.500000 1.315642e+00 8.037239e-01 1.027196e+00 7.433413e-01 6.119264e-01 3.985649e-01 5.704361e-01 3.273459e-01 5.971390e-01 ... 1.863772e-01 5.285536e-01 1.476421e-01 4.395266e-01 3.507156e-01 2.409522e-01 9.104512e-02 7.827995e-02 77.165000 0.000000
max 172792.000000 2.454930e+00 2.205773e+01 9.382558e+00 1.687534e+01 3.480167e+01 7.330163e+01 1.205895e+02 2.000721e+01 1.559499e+01 ... 2.720284e+01 1.050309e+01 2.252841e+01 4.584549e+00 7.519589e+00 3.517346e+00 3.161220e+01 3.384781e+01 25691.160000 1.000000

8 rows × 31 columns

# (Optional) column dtypes can be inspected with: card.dtypes

# 3. Correlation heatmap across all columns of the data set.
import seaborn as sns

plt.figure(figsize=(20, 10))
corr = card.corr()
sns.heatmap(corr, cmap="YlGnBu")
plt.show()

#take a sample for the scatter matrix
# Down-sample to 15% of rows so the scatterplot matrix stays responsive.
# random_state pins the draw so the figure is reproducible: the
# random.seed(1276) at the top seeds only Python's stdlib RNG, which
# pandas' sample() does not use.
card_sample = card.sample(frac=.15, random_state=1276)
# Every 4th anonymised component: V1, V5, ..., V25.
v_cols = [f"V{i}" for i in range(1, 29, 4)]

# Keep the named columns plus the selected V columns for plotting.
my_cols = ['Time', 'Amount', 'Class'] + v_cols
my_sample = card_sample[my_cols]
#4 Scatterplot matrix
import plotly.express as px
import plotly.io as pio

fig = px.scatter_matrix(
    my_sample,
    color="Class",
    title="Scatterplot Matrix of Time, Amount, Class, and Every 4th V Column",
)
# px.scatter_matrix emits a single 'splom' trace. The original call passed
# selector=dict(diagonal='histogram'), which matches no trace, so the marker
# size and showupperhalf settings were silently ignored — apply them directly.
fig.update_traces(marker=dict(size=2), showupperhalf=False)
fig.update_layout(font=dict(size=7, color='black'))
# Save an interactive copy alongside the notebook, then render inline.
pio.write_html(fig, file='scatterplot_matrix.html')
fig.show()
#5 Hold out 20% of the data for testing; the fraud label 'Class' is the target.
from sklearn.model_selection import train_test_split

y = card['Class']
X = card.drop('Class', axis=1)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1276
)
# y.head(6)

Explanation of Commented Code

The portions of code that train the models are commented out because they take roughly nine hours to run. I have saved the cross-validation results to a CSV file and load them back in to produce the box plot. Additionally, two models have been saved to disk and will be loaded back in to complete the notebook.

#6 Perform classification using methods mentioned in word doc
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score, RepeatedKFold
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from lightgbm import LGBMRegressor
import torch

# #I dont think sklearn supports gpu acceleration?
# #device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# def train_eval(X_train, y_train, X_test, y_test):
#     regressors = {
#         'Linear Regression': LinearRegression(),
#         'Decision Tree': DecisionTreeRegressor(), 
#         'Random Forest': RandomForestRegressor(), 
#         'K-Nearest Neighbors': KNeighborsRegressor(), 
#         'Bagging': BaggingRegressor(), 
#         'Gradient Boosting': GradientBoostingRegressor(),
#         'LightGBM': LGBMRegressor(),
#         'Ridge': Ridge(),
#         'Lasso': Lasso()}

#     results, names = [], []

#     for name, method in regressors.items():
#       cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=1276)
#       scores = -cross_val_score(method, X_train, y_train, scoring= 'neg_mean_absolute_error', cv=cv, n_jobs=-1)
#       results.append(scores)
#       names.append(name)
#     return results, names

# results, model_names = train_eval(X_train, y_train, X_test, y_test)
# regressmod = pd.DataFrame(np.transpose(results), columns=model_names)
# regressmod = pd.melt(regressmod.reset_index(), id_vars='index', value_vars=model_names)
# print(type(regressmod))
# print(regressmod.groupby('variable').mean())
# #save regressmod to csv
# regressmod.to_csv('new_regressmod.csv')
# Reload the saved cross-validation results (long format: one row per score).
regressmod = pd.read_csv('new_regressmod.csv')


def box_results(the_results):
    """Show a box plot of cross-validated error for each regression model.

    Expects a melted frame with a 'variable' column (model name) and a
    'value' column (score), as produced by the commented-out training code.
    """
    box_fig = px.box(
        the_results, x="variable", y="value",
        title="Box Plot of Regression Models",
    )
    box_fig.show()


box_results(regressmod)

This is a slightly more formal way to confirm what I'm seeing in the box plots. It actually looks like I can't use Logistic Regression, so I'll take it out.

# sorted_regressors = sorted(zip(model_names, results), key=lambda x: np.mean(x[1]))
# top_models = sorted_regressors[:4]

# print([name for name, _ in top_models])
#7 Base learners for the stacking ensemble — the four strongest performers
# from the cross-validation comparison above. A list literal replaces the
# original list()/append sequence.
level0 = [
    ('bag', BaggingRegressor()),
    ('dt', DecisionTreeRegressor()),
    ('rf', RandomForestRegressor()),
    ('knn', KNeighborsRegressor()),
]
print(level0)
[('bag', BaggingRegressor()), ('dt', DecisionTreeRegressor()), ('rf', RandomForestRegressor()), ('knn', KNeighborsRegressor())]
from lightgbm import LGBMRegressor
from sklearn.ensemble import StackingRegressor
# level1 = LGBMRegressor()
# stacked = StackingRegressor(estimators=level0, final_estimator=level1, cv=3)
# stacked.fit(X_train, y_train)
import pickle

# #8 export and import
# with open('new_stacked_model.pkl', 'wb') as f:
#     pickle.dump(stacked, f)

#8 Re-import the previously trained stacking model.
# NOTE: pickle.load executes arbitrary code from the file — only load
# pickles you created yourself.
# A context manager closes the handle deterministically; the original
# pickle.load(open(...)) leaked the open file object.
with open('new_stacked_model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)
#9
# Score the reloaded stacking model on the held-out 20% test split.
#make predictions with the imported model
y_pred = loaded_model.predict(X_test)
# Mean squared error of the continuous predictions against the 0/1 labels;
# y_pred is reused by the scatter plot below.
mse = mean_squared_error(y_test, y_pred)
print('Mean Squared Error: %.5f' % mse)
Mean Squared Error: 0.00035
#9 Visualise prediction quality: true label vs predicted score.
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred, alpha=.5, edgecolors='w', linewidths=.25)

# Axis labels and title.
plt.title('Actual vs Predicted')
plt.xlabel('Actual')
plt.ylabel('Predicted')

# Faint grid to make value comparison easier.
plt.grid(True, color='black', alpha=.25, linewidth=.25)

plt.tight_layout()
plt.show()